In [19]:
import pandas as pd
data = pd.read_csv("TravelInsurancePrediction.csv")
data.head()
Out[19]:
| Unnamed: 0 | Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
| 1 | 1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
| 2 | 2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
| 3 | 3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
| 4 | 4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
In [20]:
data.drop(columns=["Unnamed: 0"], inplace=True)
In [21]:
data.isnull().sum()
Out[21]:
Age 0 Employment Type 0 GraduateOrNot 0 AnnualIncome 0 FamilyMembers 0 ChronicDiseases 0 FrequentFlyer 0 EverTravelledAbroad 0 TravelInsurance 0 dtype: int64
In [22]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1987 entries, 0 to 1986 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1987 non-null int64 1 Employment Type 1987 non-null object 2 GraduateOrNot 1987 non-null object 3 AnnualIncome 1987 non-null int64 4 FamilyMembers 1987 non-null int64 5 ChronicDiseases 1987 non-null int64 6 FrequentFlyer 1987 non-null object 7 EverTravelledAbroad 1987 non-null object 8 TravelInsurance 1987 non-null int64 dtypes: int64(5), object(4) memory usage: 139.8+ KB
In [23]:
data["TravelInsurance"] = data["TravelInsurance"].map({0: "Not Purchased", 1: "Purchased"})
In [55]:
import plotly.express as px
figure = px.histogram(data, x = "Employment Type",
color = "TravelInsurance",
title= "Factors Affecting Purchase of Travel Insurance: Employment Type")
figure.show()
In [56]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
data = data
figure = px.histogram(data, x = "Age",
color = "TravelInsurance",
title= "Factors Affecting Purchase of Travel Insurance: Age")
figure.show()
In [54]:
import plotly.express as px
data = data
figure = px.histogram(data, x = "AnnualIncome",
color = "TravelInsurance",
title= "Factors Affecting Purchase of Travel Insurance: Income")
figure.show()
In [27]:
import numpy as np
data["GraduateOrNot"] = data["GraduateOrNot"].map({"No": 0, "Yes": 1})
data["FrequentFlyer"] = data["FrequentFlyer"].map({"No": 0, "Yes": 1})
data["EverTravelledAbroad"] = data["EverTravelledAbroad"].map({"No": 0, "Yes": 1})
x = np.array(data[["Age", "GraduateOrNot",
"AnnualIncome", "FamilyMembers",
"ChronicDiseases", "FrequentFlyer",
"EverTravelledAbroad"]])
y = np.array(data[["TravelInsurance"]])
In [28]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=42)
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
predictions = model.predict(xtest)
In [29]:
from sklearn.metrics import accuracy_score
print(accuracy_score(ytest,predictions))
0.8090452261306532
In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
In [31]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=6, n_estimators=100, oob_score=True)
rf.fit(xtrain, ytrain)
C:\Users\Nicholas Bagwandeen\AppData\Local\Temp\ipykernel_35044\3342771140.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Out[31]:
RandomForestClassifier(max_depth=6, n_jobs=-1, oob_score=True, random_state=42)
In [32]:
rf.oob_score_
Out[32]:
0.8316554809843401
In [33]:
y_pred = rf.predict(xtest)
print(accuracy_score(ytest, y_pred))
0.8592964824120602
In [34]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
In [35]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = [1,2,3,4,5]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,3,4, 5,6,7,8,9, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2,3, 4,5,6,7,8,9,10]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
'max_features': [1, 2, 3, 4, 5],
'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000]}
In [36]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 500, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(xtrain, ytrain)
Fitting 3 folds for each of 500 candidates, totalling 1500 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:926: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Out[36]:
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=500,
n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60,
70, 80, 90, 100, 110,
None],
'max_features': [1, 2, 3, 4, 5],
'min_samples_leaf': [1, 2, 3, 4, 5, 6,
7, 8, 9, 10],
'min_samples_split': [2, 3, 4, 5, 6, 7,
8, 9, 10],
'n_estimators': [100, 311, 522, 733,
944, 1155, 1366, 1577,
1788, 2000]},
random_state=42, verbose=2)
In [37]:
rf_random.best_params_
Out[37]:
{'n_estimators': 1366,
'min_samples_split': 4,
'min_samples_leaf': 2,
'max_features': 2,
'max_depth': 10,
'bootstrap': True}
In [38]:
y_pred = rf_random.predict(xtest)
In [39]:
accuracy_score(ytest, y_pred)
Out[39]:
0.8542713567839196
In [40]:
from sklearn.model_selection import GridSearchCV# Create the parameter grid based on the results of random search
param_grid = {
'bootstrap': [True],
'max_depth': [8, 9, 10, 11, 12],
'max_features': [1 ,2, 3],
'min_samples_leaf': [1, 2, 3, 4, 5],
'min_samples_split': [2,3,4,5,6],
'n_estimators': [1200,1250,1300,1350,1400]
}# Create a based model
rf = RandomForestClassifier()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
cv = 3, n_jobs = -1, verbose = 2)
In [41]:
# Fit the grid search to the data
grid_search.fit(xtrain, ytrain)
grid_search.best_params_
Fitting 3 folds for each of 1875 candidates, totalling 5625 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:926: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Out[41]:
{'bootstrap': True,
'max_depth': 9,
'max_features': 3,
'min_samples_leaf': 2,
'min_samples_split': 4,
'n_estimators': 1200}
In [43]:
rf = RandomForestClassifier(bootstrap=True, max_depth=9, max_features=3, min_samples_leaf=2, min_samples_split=4, n_estimators=1200)
rf.fit(xtrain,ytrain)
y_pred = rf.predict(xtest)
accuracy_score(ytest, y_pred)
C:\Users\Nicholas Bagwandeen\AppData\Local\Temp\ipykernel_35044\2290841162.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Out[43]:
0.8542713567839196
In [ ]:
In [ ]: